import pandoc
import nbconvert
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
file = '/Users/gretastojanovic/Desktop/DS/Python/RMIT_Python/Assignment_1/A1_HR_Employee_Data.csv'
df = pd.read_csv(file)
df = pd.DataFrame(df)
df.head()
| EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | ... | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7912 | 41 | Yes | Travel_Rarely | Sales | 2.0 | Female | 4.0 | Single | 5993.0 | ... | 11 | 3 | 63.0 | 8 | 0 | 1.0 | 6 | 4 | 0 | 5 |
| 1 | 1520 | 49 | No | Travel_Frequently | Consultants | 1.0 | Male | 2.0 | Married | 5130.0 | ... | 23 | 4 | 40.0 | 10 | 3 | 3.0 | 10 | 7 | 1 | 7 |
| 2 | 1488 | 37 | Yes | Travel_Rarely | Consultants | 2.0 | Male | 3.0 | Single | 2090.0 | ... | 15 | 3 | 50.0 | 7 | 3 | 3.0 | 0 | 0 | 0 | 0 |
| 3 | 2535 | 33 | No | Travel_Frequently | Consultants | 4.0 | Female | 3.0 | Married | 2909.0 | ... | 11 | 3 | 48.0 | 8 | 3 | 3.0 | 8 | 7 | 3 | 0 |
| 4 | 4582 | 27 | No | Travel_Rarely | Consultants | 1.0 | Male | 2.0 | Married | 3468.0 | ... | 12 | 3 | 40.0 | 6 | 3 | 3.0 | 2 | 2 | 2 | 2 |
5 rows × 22 columns
#Confirming that the loaded data is equivalent to the data in the source (CSV) file
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1482 entries, 0 to 1481 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 EmployeeID 1482 non-null int64 1 Age 1482 non-null object 2 Resigned 1480 non-null object 3 BusinessTravel 1482 non-null object 4 BusinessUnit 1482 non-null object 5 EducationLevel 1481 non-null float64 6 Gender 1482 non-null object 7 JobSatisfaction 1481 non-null float64 8 MaritalStatus 1482 non-null object 9 MonthlyIncome 1479 non-null float64 10 NumCompaniesWorked 1482 non-null int64 11 OverTime 1479 non-null object 12 PercentSalaryHike 1482 non-null int64 13 PerformanceRating 1482 non-null int64 14 AverageWeeklyHoursWorked 1482 non-null float64 15 TotalWorkingYears 1482 non-null int64 16 TrainingTimesLastYear 1482 non-null int64 17 WorkLifeBalance 1481 non-null float64 18 YearsAtCompany 1482 non-null int64 19 YearsInRole 1482 non-null int64 20 YearsSinceLastPromotion 1482 non-null int64 21 YearsWithCurrManager 1482 non-null int64 dtypes: float64(5), int64(10), object(7) memory usage: 254.8+ KB
The length of the dataframe matches the number of rows in the csv. The columns are also the same.
#check for missing values
df.isna().sum()
EmployeeID 0 Age 0 Resigned 2 BusinessTravel 0 BusinessUnit 0 EducationLevel 1 Gender 0 JobSatisfaction 1 MaritalStatus 0 MonthlyIncome 3 NumCompaniesWorked 0 OverTime 3 PercentSalaryHike 0 PerformanceRating 0 AverageWeeklyHoursWorked 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 1 YearsAtCompany 0 YearsInRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
#get object columns
objectcols = list(df.select_dtypes(['object']).columns)
objectcols
['Age', 'Resigned', 'BusinessTravel', 'BusinessUnit', 'Gender', 'MaritalStatus', 'OverTime']
#get unique values for each column in the dataframe to idenitfy typos and other issues that need to be addressed
def show_uniques(df):
for col in df:
if col in objectcols:
print(col)
print(df[col].unique())
print("\n")
show_uniques(df)
Age ['41' '49' '37' '33' '27' '32' '59' '30' '38' '36' '35' '29' '31' '34' '28' '22' '53' '24' '21' '42' '44' '46' '39' '43' '50' '26' '48' '55' '45' '56' '23' '51' '40' '54' '58' '20' '25' '19' '57' '52' '47' '18' '60' '36a'] Resigned ['Yes' 'No' 'Y' nan 'NO' 'N' 'no'] BusinessTravel ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel' 'Travels_Rarely' 'TRAVEL_RARELY' 'rarely_travel'] BusinessUnit ['Sales' 'Consultants' 'Business Operations' 'Female'] Gender ['Female' 'Male' ' Male' ' Female' 'Sales' 'MMale' 'M' 'male'] MaritalStatus ['Single' 'Married' 'Divorced' ' Divorced' ' Single' 'D'] OverTime ['Yes' 'No' nan]
#cleaning and fixing errors in the object variables
def cleanup(df):
objectcols = list(df.select_dtypes(['object']).columns)
#converting object variables to upper case
def uppercase(df):
for col in objectcols:
df[col] = df[col].str.upper()
uppercase(df)
#removing white space from strings
def stripwhitespace(df):
for col in objectcols:
df[col] = df[col].str.strip()
stripwhitespace(df)
def show_uniques(df):
for col in df:
if col in objectcols:
print(col)
print(df[col].unique())
print("\n")
show_uniques(df)
cleanup(df)
Age ['41' '49' '37' '33' '27' '32' '59' '30' '38' '36' '35' '29' '31' '34' '28' '22' '53' '24' '21' '42' '44' '46' '39' '43' '50' '26' '48' '55' '45' '56' '23' '51' '40' '54' '58' '20' '25' '19' '57' '52' '47' '18' '60' '36A'] Resigned ['YES' 'NO' 'Y' nan 'N'] BusinessTravel ['TRAVEL_RARELY' 'TRAVEL_FREQUENTLY' 'NON-TRAVEL' 'TRAVELS_RARELY' 'RARELY_TRAVEL'] BusinessUnit ['SALES' 'CONSULTANTS' 'BUSINESS OPERATIONS' 'FEMALE'] Gender ['FEMALE' 'MALE' 'SALES' 'MMALE' 'M'] MaritalStatus ['SINGLE' 'MARRIED' 'DIVORCED' 'D'] OverTime ['YES' 'NO' nan]
#consolidating duplicate responses eg "Y" and "YES" to "YES
def consolidate_duplicates(df):
d = {'36A': 36, 'Y': 'YES', 'N': 'NO', 'TRAVELS_RARELY': 'TRAVEL_RARELY', 'RARELY_TRAVEL': 'TRAVEL_RARELY', 'MMALE':'MALE', 'M':'MALE', 'D': 'DIVORCED'}
for col in df:
if col in objectcols:
df[col] = df[col].replace(d)
consolidate_duplicates(df)
#remove erroneous responses that do not belong to column
df.drop(df[df.BusinessUnit == 'FEMALE'].index, inplace = True)
df.drop(df[df.Gender == 'SALES'].index, inplace = True)
df.drop(df[df.Gender == 'MMALE'].index, inplace = True)
#Converting Age to type integer
df.Age = df.Age.astype(int)
df.Age.dtypes
dtype('int64')
contains_na = df.columns[df.isna().any()].tolist()
contains_na
['Resigned', 'EducationLevel', 'JobSatisfaction', 'MonthlyIncome', 'OverTime', 'WorkLifeBalance']
for col in df.columns:
if col not in objectcols:
if col in contains_na:
df[col].fillna(df[col].mean(), inplace=True)
if col == 'JobSatisfaction':
df[col] = df[col].astype(int)
df = df.fillna(method='ffill')
#confirming null data has been corrected and there are no longer missing values
null_data = df[df.isnull().any(axis=1)]
null_data
| EmployeeID | Age | Resigned | BusinessTravel | BusinessUnit | EducationLevel | Gender | JobSatisfaction | MaritalStatus | MonthlyIncome | ... | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager |
|---|
0 rows × 22 columns
#check uniques are now corrected
def get_new_uniques(df):
objectcols = list(df.select_dtypes(['object']).columns)
def show_uniques(df):
for col in df:
if col in objectcols:
print(col)
print(df[col].unique())
print("\n")
#return df[col].unique()
show_uniques(df)
get_new_uniques(df)
Resigned ['YES' 'NO'] BusinessTravel ['TRAVEL_RARELY' 'TRAVEL_FREQUENTLY' 'NON-TRAVEL'] BusinessUnit ['SALES' 'CONSULTANTS' 'BUSINESS OPERATIONS'] Gender ['FEMALE' 'MALE'] MaritalStatus ['SINGLE' 'MARRIED' 'DIVORCED'] OverTime ['YES' 'NO']
#confirming that missing values have been removed
df.isna().sum()
EmployeeID 0 Age 0 Resigned 0 BusinessTravel 0 BusinessUnit 0 EducationLevel 0 Gender 0 JobSatisfaction 0 MaritalStatus 0 MonthlyIncome 0 NumCompaniesWorked 0 OverTime 0 PercentSalaryHike 0 PerformanceRating 0 AverageWeeklyHoursWorked 0 TotalWorkingYears 0 TrainingTimesLastYear 0 WorkLifeBalance 0 YearsAtCompany 0 YearsInRole 0 YearsSinceLastPromotion 0 YearsWithCurrManager 0 dtype: int64
numerics = list(df.select_dtypes(['int64', 'float64']).columns)
#remove employee id from numerics analysis
dfn1 = df[numerics]
dfn1 = dfn1.iloc[:,1:]
dfn1.describe()
| Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 | 1481.000000 |
| mean | 36.877110 | 2.914189 | 2.731938 | 6488.811908 | 2.692775 | 15.193113 | 3.153950 | 43.284267 | 11.245780 | 2.797434 | 2.762838 | 6.998650 | 4.230925 | 2.193113 | 4.118839 |
| std | 9.133012 | 1.023409 | 1.100546 | 4694.727846 | 2.500704 | 3.680369 | 0.362889 | 10.723406 | 7.770376 | 1.286949 | 0.706034 | 6.114117 | 3.620048 | 3.226450 | 3.564377 |
| min | 18.000000 | 1.000000 | 1.000000 | 1009.000000 | 0.000000 | 0.000000 | 2.000000 | 40.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | 2.000000 | 2.000000 | 2911.000000 | 1.000000 | 12.000000 | 3.000000 | 40.000000 | 6.000000 | 2.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | 3.000000 | 3.000000 | 4907.000000 | 2.000000 | 14.000000 | 3.000000 | 40.000000 | 10.000000 | 3.000000 | 3.000000 | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | 4.000000 | 4.000000 | 8321.000000 | 4.000000 | 18.000000 | 3.000000 | 47.000000 | 15.000000 | 3.000000 | 3.000000 | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | 5.000000 | 4.000000 | 19999.000000 | 9.000000 | 25.000000 | 4.000000 | 400.000000 | 40.000000 | 6.000000 | 4.000000 | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
#subset numeric data to columns with max value <= 70 and max value > 70 to allow for comparison of multiple columns via boxplots side by side)
u_or_eq_to_seventy = []
over_seventy = []
for col in dfn1:
if max(dfn1[col]) <= 70:
u_or_eq_to_seventy.append(col)
else: over_seventy.append(col)
#create dataframe uf to represent the columns with max value less than or equal to 70
us = dfn1[u_or_eq_to_seventy]
ax = sns.boxplot(data=us, orient="h", palette="Set2")
#create dataframe of to represent remaining columns with max values over 70
os = df[over_seventy]
fig = px.box(os, y = "MonthlyIncome",
labels = {"MonthlyIncome": "Monthly Income"})
fig.show()
The box plot above shows a high number of Monthly Income values as outliers however this could be due to the split between executive management and staff so will not be removed at present.
#plotting AverageWeeklyHoursWorked
fig = px.box(os, y = "AverageWeeklyHoursWorked",
labels = {"AverageWeeklyHoursWorked": "Average Weekly Hours Worked"})
fig.show()
The above plot for Average Weekly Hours Worked shows an outlier of 400 which is more than the number of hours in the week and will now be removed.
#dropping row that contains value of 400
df.drop(df[df.AverageWeeklyHoursWorked == 400].index, inplace = True)
#plotting again without outlier value
fig = px.box(df, y = "AverageWeeklyHoursWorked",
labels = {"AverageWeeklyHoursWorked": "Average Weekly Hours Worked"})
fig.show()
#export dataframe to csv
#df.to_csv('/Users/greta/iCloud/Documents/DataScience/RMIT_MASTERS/RMIT_Python/s3914796.csv')
#plotting the Resigned variable as a histogram
fig = hist = px.histogram(df, x = "Resigned",
labels = {"Resigned": "Resignation Status", "count": "Number of Employees"},
title = "Employee Resignation Status (Yes vs No)")
fig.show()
#plotting the JobSatisfaction variable as a histogram
#https://plotly.com/python/axes/
jobs = df['JobSatisfaction'].astype(int)
jobs = pd.DataFrame(jobs)
jobs['totalresponses'] = len(jobs['JobSatisfaction'])
jobs['count'] = 1
js_tally = jobs.groupby(by=["JobSatisfaction"]).sum()
js_tally["JobSatisfaction"] = js_tally.index
js_tally['total']= js_tally["count"].sum()
js_tally['propn'] = js_tally["count"] / js_tally["total"]
js_tally
| totalresponses | count | JobSatisfaction | total | propn | |
|---|---|---|---|---|---|
| JobSatisfaction | |||||
| 1 | 427720 | 289 | 1 | 1480 | 0.195270 |
| 2 | 415880 | 281 | 2 | 1480 | 0.189865 |
| 3 | 663040 | 448 | 3 | 1480 | 0.302703 |
| 4 | 683760 | 462 | 4 | 1480 | 0.312162 |
fig = px.bar(js_tally, x = "JobSatisfaction", y = "propn", color = 'JobSatisfaction', barmode = "group",
labels = {"JobSatisfaction": "Job Satisfaction"},
title = "Employee Job Satisfaction Survey Results")
fig = fig.update_xaxes(type='category')
fig.show()
#plotting the AverageWeeklyHoursWorked variable as a boxplot
#https://plotly.com/python/box-plots/
fig = px.box(df, y = "AverageWeeklyHoursWorked",
labels = {"AverageWeeklyHoursWorked": "Average Weekly Hours Worked"},
title = "Boxplot of Average Weekly Hours Worked")
fig.show()
As a starting point, to get an understanding of the relationships between the variables, the correlation is plotted with a correlation plot below. The object variables are transformed using one hot encoding to allow all variables to appear in the correlation plot:
#using one hot encoding to get dummy variables for categoricals
onehot = pd.get_dummies(df, prefix=['Resigned', 'BusinessTravel',
'BusinessUnit',
'Gender',
'MaritalStatus',
'OverTime'])
onehot = onehot.iloc[:,1:]
onehot.head()
| Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | ... | BusinessUnit_BUSINESS OPERATIONS | BusinessUnit_CONSULTANTS | BusinessUnit_SALES | Gender_FEMALE | Gender_MALE | MaritalStatus_DIVORCED | MaritalStatus_MARRIED | MaritalStatus_SINGLE | OverTime_NO | OverTime_YES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41 | 2.0 | 4 | 5993.0 | 8 | 11 | 3 | 63.0 | 8 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 1 | 49 | 1.0 | 2 | 5130.0 | 1 | 23 | 4 | 40.0 | 10 | 3 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2 | 37 | 2.0 | 3 | 2090.0 | 6 | 15 | 3 | 50.0 | 7 | 3 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3 | 33 | 4.0 | 3 | 2909.0 | 1 | 11 | 3 | 48.0 | 8 | 3 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 4 | 27 | 1.0 | 2 | 3468.0 | 9 | 12 | 3 | 40.0 | 6 | 3 | ... | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 |
5 rows × 30 columns
#get a correlation plot to show relationships between all variables
corr = onehot.corr()
corr.style.background_gradient(cmap='coolwarm')
| Age | EducationLevel | JobSatisfaction | MonthlyIncome | NumCompaniesWorked | PercentSalaryHike | PerformanceRating | AverageWeeklyHoursWorked | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInRole | YearsSinceLastPromotion | YearsWithCurrManager | Resigned_NO | Resigned_YES | BusinessTravel_NON-TRAVEL | BusinessTravel_TRAVEL_FREQUENTLY | BusinessTravel_TRAVEL_RARELY | BusinessUnit_BUSINESS OPERATIONS | BusinessUnit_CONSULTANTS | BusinessUnit_SALES | Gender_FEMALE | Gender_MALE | MaritalStatus_DIVORCED | MaritalStatus_MARRIED | MaritalStatus_SINGLE | OverTime_NO | OverTime_YES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1.000000 | 0.208800 | -0.007701 | 0.497024 | 0.299535 | 0.003198 | -0.003490 | 0.023989 | 0.681226 | -0.019700 | -0.021413 | 0.312299 | 0.213238 | 0.214865 | 0.203192 | 0.156861 | -0.156861 | -0.008685 | -0.025134 | 0.027443 | 0.013340 | 0.018257 | -0.024868 | 0.035255 | -0.035255 | 0.036180 | 0.083140 | -0.120988 | -0.031085 | 0.031085 |
| EducationLevel | 0.208800 | 1.000000 | -0.011566 | 0.093747 | 0.129916 | -0.009828 | -0.026126 | -0.016104 | 0.148880 | -0.027137 | 0.011809 | 0.069614 | 0.061505 | 0.055955 | 0.070828 | 0.029490 | -0.029490 | 0.006695 | -0.006508 | 0.001143 | 0.008185 | -0.018363 | 0.015371 | 0.015143 | -0.015143 | -0.000975 | -0.003193 | 0.004278 | 0.020566 | -0.020566 |
| JobSatisfaction | -0.007701 | -0.011566 | 1.000000 | -0.008037 | -0.054452 | 0.015846 | 0.001962 | 0.039952 | -0.023345 | -0.005473 | -0.017375 | -0.006343 | -0.004731 | -0.019911 | -0.029465 | 0.103884 | -0.103884 | 0.019279 | 0.028004 | -0.036978 | -0.025660 | -0.001730 | 0.013245 | -0.034995 | 0.034995 | -0.016284 | -0.012484 | 0.027827 | -0.022657 | 0.022657 |
| MonthlyIncome | 0.497024 | 0.093747 | -0.008037 | 1.000000 | 0.148716 | -0.024909 | -0.015655 | -0.026491 | 0.771744 | -0.020724 | 0.030047 | 0.513346 | 0.362107 | 0.342721 | 0.342094 | 0.159610 | -0.159610 | -0.017478 | -0.032268 | 0.039451 | 0.005548 | -0.065200 | 0.065073 | 0.030716 | -0.030716 | 0.032694 | 0.055860 | -0.088754 | -0.007805 | 0.007805 |
| NumCompaniesWorked | 0.299535 | 0.129916 | -0.054452 | 0.148716 | 1.000000 | -0.010863 | -0.016426 | -0.016790 | 0.236144 | -0.067402 | -0.004696 | -0.119380 | -0.092479 | -0.038828 | -0.111840 | -0.041931 | 0.041931 | 0.002964 | -0.036901 | 0.029813 | 0.015743 | 0.020984 | -0.028766 | 0.038546 | -0.038546 | 0.040735 | -0.016280 | -0.018877 | 0.020671 | -0.020671 |
| PercentSalaryHike | 0.003198 | -0.009828 | 0.015846 | -0.024909 | -0.010863 | 1.000000 | 0.770972 | 0.020969 | -0.016182 | -0.005681 | -0.006904 | -0.032424 | 0.001312 | -0.017385 | -0.009938 | 0.012571 | -0.012571 | 0.035010 | -0.004273 | -0.019659 | -0.015773 | 0.024427 | -0.018267 | -0.001496 | 0.001496 | -0.022812 | 0.022302 | -0.003509 | 0.002724 | -0.002724 |
| PerformanceRating | -0.003490 | -0.026126 | 0.001962 | -0.015655 | -0.016426 | 0.770972 | 1.000000 | 0.024617 | 0.005642 | -0.018497 | -0.000337 | 0.003422 | 0.034100 | 0.018026 | 0.020395 | -0.001699 | 0.001699 | 0.016843 | 0.014372 | -0.023610 | 0.008964 | 0.026915 | -0.031885 | 0.013854 | -0.013854 | -0.011341 | 0.009535 | -0.000086 | -0.003784 | 0.003784 |
| AverageWeeklyHoursWorked | 0.023989 | -0.016104 | 0.039952 | -0.026491 | -0.016790 | 0.020969 | 0.024617 | 1.000000 | -0.011241 | -0.098157 | -0.205792 | -0.033725 | -0.052856 | -0.019069 | -0.060613 | -0.316071 | 0.316071 | -0.051245 | 0.030799 | 0.007632 | -0.011527 | 0.005317 | -0.000364 | 0.020590 | -0.020590 | 0.009316 | -0.009405 | 0.001750 | -0.895522 | 0.895522 |
| TotalWorkingYears | 0.681226 | 0.148880 | -0.023345 | 0.771744 | 0.236144 | -0.016182 | 0.005642 | -0.011241 | 1.000000 | -0.035613 | -0.000721 | 0.628840 | 0.460719 | 0.404242 | 0.459768 | 0.168522 | -0.168522 | -0.028078 | -0.012421 | 0.029420 | 0.005173 | 0.010337 | -0.013018 | 0.046452 | -0.046452 | 0.038571 | 0.053597 | -0.091569 | -0.015633 | 0.015633 |
| TrainingTimesLastYear | -0.019700 | -0.027137 | -0.005473 | -0.020724 | -0.067402 | -0.005681 | -0.018497 | -0.098157 | -0.035613 | 1.000000 | 0.027545 | 0.002953 | -0.006807 | -0.002715 | -0.004791 | 0.059679 | -0.059679 | -0.021495 | 0.006113 | 0.009064 | -0.043107 | -0.005280 | 0.024709 | 0.037716 | -0.037716 | 0.008200 | -0.030103 | 0.024847 | 0.077874 | -0.077874 |
| WorkLifeBalance | -0.021413 | 0.011809 | -0.017375 | 0.030047 | -0.004696 | -0.006904 | -0.000337 | -0.205792 | -0.000721 | 0.027545 | 1.000000 | 0.010000 | 0.048165 | 0.006617 | 0.001012 | 0.066478 | -0.066478 | 0.008966 | 0.012756 | -0.016966 | 0.042916 | -0.068954 | 0.052284 | 0.000967 | -0.000967 | -0.007277 | -0.008322 | 0.015365 | 0.028260 | -0.028260 |
| YearsAtCompany | 0.312299 | 0.069614 | -0.006343 | 0.513346 | -0.119380 | -0.032424 | 0.003422 | -0.033725 | 0.628840 | 0.002953 | 0.010000 | 1.000000 | 0.759042 | 0.618107 | 0.769200 | 0.132020 | -0.132020 | 0.008698 | 0.012297 | -0.016393 | 0.008821 | -0.032482 | 0.029715 | 0.030275 | -0.030275 | 0.027088 | 0.045632 | -0.072841 | 0.010267 | -0.010267 |
| YearsInRole | 0.213238 | 0.061505 | -0.004731 | 0.362107 | -0.092479 | 0.001312 | 0.034100 | -0.052856 | 0.460719 | -0.006807 | 0.048165 | 0.759042 | 1.000000 | 0.549387 | 0.715110 | 0.157658 | -0.157658 | 0.012787 | 0.001748 | -0.010031 | -0.038067 | -0.027790 | 0.045780 | 0.042457 | -0.042457 | 0.019581 | 0.066392 | -0.088328 | 0.029274 | -0.029274 |
| YearsSinceLastPromotion | 0.214865 | 0.055955 | -0.019911 | 0.342721 | -0.038828 | -0.017385 | 0.018026 | -0.019069 | 0.404242 | -0.002715 | 0.006617 | 0.618107 | 0.549387 | 1.000000 | 0.511830 | 0.029534 | -0.029534 | 0.019852 | 0.024460 | -0.034307 | -0.025815 | -0.020657 | 0.032922 | 0.027667 | -0.027667 | -0.005866 | 0.054600 | -0.053083 | 0.012202 | -0.012202 |
| YearsWithCurrManager | 0.203192 | 0.070828 | -0.029465 | 0.342094 | -0.111840 | -0.009938 | 0.020395 | -0.060613 | 0.459768 | -0.004791 | 0.001012 | 0.769200 | 0.715110 | 0.511830 | 1.000000 | 0.151786 | -0.151786 | 0.018253 | 0.013091 | -0.023447 | -0.029313 | -0.013334 | 0.026897 | 0.029741 | -0.029741 | 0.015617 | 0.032276 | -0.048368 | 0.040563 | -0.040563 |
| Resigned_NO | 0.156861 | 0.029490 | 0.103884 | 0.159610 | -0.041931 | 0.012571 | -0.001699 | -0.316071 | 0.168522 | 0.059679 | 0.066478 | 0.132020 | 0.157658 | 0.029534 | 0.151786 | 1.000000 | -1.000000 | 0.074636 | -0.113492 | 0.048012 | -0.013888 | 0.083771 | -0.080591 | 0.031381 | -0.031381 | 0.087443 | 0.092399 | -0.176509 | 0.245917 | -0.245917 |
| Resigned_YES | -0.156861 | -0.029490 | -0.103884 | -0.159610 | 0.041931 | -0.012571 | 0.001699 | 0.316071 | -0.168522 | -0.059679 | -0.066478 | -0.132020 | -0.157658 | -0.029534 | -0.151786 | -1.000000 | 1.000000 | -0.074636 | 0.113492 | -0.048012 | 0.013888 | -0.083771 | 0.080591 | -0.031381 | 0.031381 | -0.087443 | -0.092399 | 0.176509 | -0.245917 | 0.245917 |
| BusinessTravel_NON-TRAVEL | -0.008685 | 0.006695 | 0.019279 | -0.017478 | 0.002964 | 0.035010 | 0.016843 | -0.051245 | -0.028078 | -0.021495 | 0.008966 | 0.008698 | 0.012787 | 0.019852 | 0.018253 | 0.074636 | -0.074636 | 1.000000 | -0.162464 | -0.526728 | -0.006882 | -0.003577 | 0.006777 | -0.052396 | 0.052396 | 0.061997 | -0.045584 | -0.006511 | 0.036962 | -0.036962 |
| BusinessTravel_TRAVEL_FREQUENTLY | -0.025134 | -0.006508 | 0.028004 | -0.032268 | -0.036901 | -0.004273 | 0.014372 | 0.030799 | -0.012421 | 0.006113 | 0.012756 | 0.012297 | 0.001748 | 0.024460 | 0.013091 | -0.113492 | 0.113492 | -0.162464 | 1.000000 | -0.753167 | -0.010567 | 0.001885 | 0.002763 | 0.021897 | -0.021897 | 0.004856 | -0.030554 | 0.028304 | -0.029124 | 0.029124 |
| BusinessTravel_TRAVEL_RARELY | 0.027443 | 0.001143 | -0.036978 | 0.039451 | 0.029813 | -0.019659 | -0.023610 | 0.007632 | 0.029420 | 0.009064 | -0.016966 | -0.016393 | -0.010031 | -0.034307 | -0.023447 | 0.048012 | -0.048012 | -0.526728 | -0.753167 | 1.000000 | 0.013691 | 0.000761 | -0.006899 | 0.016067 | -0.016067 | -0.045516 | 0.056712 | -0.020043 | 0.000447 | -0.000447 |
| BusinessUnit_BUSINESS OPERATIONS | 0.013340 | 0.008185 | -0.025660 | 0.005548 | 0.015743 | -0.015773 | 0.008964 | -0.011527 | 0.005173 | -0.043107 | 0.042916 | 0.008821 | -0.038067 | -0.025815 | -0.029313 | -0.013888 | 0.013888 | -0.006882 | -0.010567 | 0.013691 | 1.000000 | -0.294701 | -0.140988 | -0.027211 | 0.027211 | 0.012659 | 0.041183 | -0.055247 | 0.009318 | -0.009318 |
| BusinessUnit_CONSULTANTS | 0.018257 | -0.018363 | -0.001730 | -0.065200 | 0.020984 | 0.024427 | 0.026915 | 0.005317 | 0.010337 | -0.005280 | -0.068954 | -0.032482 | -0.027790 | -0.020657 | -0.013334 | 0.083771 | -0.083771 | -0.003577 | 0.001885 | 0.000761 | -0.294701 | 1.000000 | -0.904495 | -0.016969 | 0.016969 | 0.035808 | -0.021236 | -0.009199 | 0.003433 | -0.003433 |
| BusinessUnit_SALES | -0.024868 | 0.015371 | 0.013245 | 0.065073 | -0.028766 | -0.018267 | -0.031885 | -0.000364 | -0.013018 | 0.024709 | 0.052284 | 0.029715 | 0.045780 | 0.032922 | 0.026897 | -0.080591 | 0.080591 | 0.006777 | 0.002763 | -0.006899 | -0.140988 | -0.904495 | 1.000000 | 0.029725 | -0.029725 | -0.042748 | 0.003620 | 0.034187 | -0.007715 | 0.007715 |
| Gender_FEMALE | 0.035255 | 0.015143 | -0.034995 | 0.030716 | 0.038546 | -0.001496 | 0.013854 | 0.020590 | 0.046452 | 0.037716 | 0.000967 | 0.030275 | 0.042457 | 0.027667 | 0.029741 | 0.031381 | -0.031381 | -0.052396 | 0.021897 | 0.016067 | -0.027211 | -0.016969 | 0.029725 | 1.000000 | -1.000000 | -0.047877 | 0.014783 | 0.026832 | -0.040853 | 0.040853 |
| Gender_MALE | -0.035255 | -0.015143 | 0.034995 | -0.030716 | -0.038546 | 0.001496 | -0.013854 | -0.020590 | -0.046452 | -0.037716 | -0.000967 | -0.030275 | -0.042457 | -0.027667 | -0.029741 | -0.031381 | 0.031381 | 0.052396 | -0.021897 | -0.016067 | 0.027211 | 0.016969 | -0.029725 | -1.000000 | 1.000000 | 0.047877 | -0.014783 | -0.026832 | 0.040853 | -0.040853 |
| MaritalStatus_DIVORCED | 0.036180 | -0.000975 | -0.016284 | 0.032694 | 0.040735 | -0.022812 | -0.011341 | 0.009316 | 0.038571 | 0.008200 | -0.007277 | 0.027088 | 0.019581 | -0.005866 | 0.015617 | 0.087443 | -0.087443 | 0.061997 | 0.004856 | -0.045516 | 0.012659 | 0.035808 | -0.042748 | -0.047877 | 0.047877 | 1.000000 | -0.490612 | -0.366270 | -0.024628 | 0.024628 |
| MaritalStatus_MARRIED | 0.083140 | -0.003193 | -0.012484 | 0.055860 | -0.016280 | 0.022302 | 0.009535 | -0.009405 | 0.053597 | -0.030103 | -0.008322 | 0.045632 | 0.066392 | 0.054600 | 0.032276 | 0.092399 | -0.092399 | -0.045584 | -0.030554 | 0.056712 | 0.041183 | -0.021236 | 0.003620 | 0.014783 | -0.014783 | -0.490612 | 1.000000 | -0.631129 | 0.013796 | -0.013796 |
| MaritalStatus_SINGLE | -0.120988 | 0.004278 | 0.027827 | -0.088754 | -0.018877 | -0.003509 | -0.000086 | 0.001750 | -0.091569 | 0.024847 | 0.015365 | -0.072841 | -0.088328 | -0.053083 | -0.048368 | -0.176509 | 0.176509 | -0.006511 | 0.028304 | -0.020043 | -0.055247 | -0.009199 | 0.034187 | 0.026832 | -0.026832 | -0.366270 | -0.631129 | 1.000000 | 0.007192 | -0.007192 |
| OverTime_NO | -0.031085 | 0.020566 | -0.022657 | -0.007805 | 0.020671 | 0.002724 | -0.003784 | -0.895522 | -0.015633 | 0.077874 | 0.028260 | 0.010267 | 0.029274 | 0.012202 | 0.040563 | 0.245917 | -0.245917 | 0.036962 | -0.029124 | 0.000447 | 0.009318 | 0.003433 | -0.007715 | -0.040853 | 0.040853 | -0.024628 | 0.013796 | 0.007192 | 1.000000 | -1.000000 |
| OverTime_YES | 0.031085 | -0.020566 | 0.022657 | 0.007805 | -0.020671 | -0.002724 | 0.003784 | 0.895522 | 0.015633 | -0.077874 | -0.028260 | -0.010267 | -0.029274 | -0.012202 | -0.040563 | -0.245917 | 0.245917 | -0.036962 | 0.029124 | -0.000447 | -0.009318 | -0.003433 | 0.007715 | 0.040853 | -0.040853 | 0.024628 | -0.013796 | -0.007192 | -1.000000 | 1.000000 |
Hypothesis 1: That employees that resign spend a shorter number of years at the company.
The below scatterplot shows a comparison between resignation status and time spent at the company. As expected, employees that did not resign have spent a longer amount of years in the role and also have a higher density of points above the period of 10 years at the company.
#https://plotly.com/python/line-and-scatter/#scatter-plots-and-categorical-axes
#https://plotly.com/python/figure-labels/
fig = px.scatter(onehot, x="YearsAtCompany", y="YearsInRole", color="Resigned_YES", facet_col="Resigned_YES",
labels = {"YearsAtCompany": "Yrs at Company", "YearsInRole": "Yrs in Role",
"Resigned": "Resigned"},
title = "Years at Company vs Years in Role by Resignation Status")
fig.show()
Hypothesis 2: That employees with higher performance ratings are most likely to get a higher pay rise.
The below plot shows the PercentSalaryHike variable with employees split into their various performance rating. As expected, employees with the highest performance ratings typically have the highest Percent Salary Hike metric. Interestingly, the difference between a performance rating of 3 and 4 is the difference between achieving a salary hike of 20% or above.
fig = px.ecdf(onehot, x= "PercentSalaryHike", color="PerformanceRating")
fig.show()